./../SFARI_genes_01-15-2019.csv: SFARI gene information downloaded from https://gene.sfari.org/database/gene-scoring/. Downloaded version: 01-15-19.
SFARI_scraping/genes/gene_name.csv: one file per SFARI gene, with the information on all the reports related to it, obtained using the SFARI_scraper.py script and the scrapy package.
MeSH_terms_by_paper.json: file with all the MeSH terms associated with each of the articles from the SFARI_scraper output files. Retrieved from the NCBI E-utilities API using the get_articles_metadata.R script and the reutils package. Downloaded on 24/05/19.
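For reference, the retrieval step can be sketched with reutils plus an XML parser. This is a minimal illustration, not the script itself: the helper name is made up here, and the actual logic lives in get_articles_metadata.R.

library(reutils)  # NCBI E-utilities client
library(xml2)

# Hypothetical helper: fetch one article's PubMed record and extract the names
# of its MeSH descriptors
get_mesh_terms = function(pmid){
  record = efetch(pmid, db = 'pubmed', retmode = 'xml')
  doc = read_xml(content(record, as = 'text'))
  xml_text(xml_find_all(doc, '//MeshHeading/DescriptorName'))
}
# get_mesh_terms('<pubmed id>')  # character vector of the article's MeSH terms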
# Read json with MeSH terms by paper
mesh_terms_by_article_list = read_json('./../Data/MeSH_terms_by_paper.json', simplifyVector = TRUE)
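# (Structure assumed from the code below: a named list keyed by PubMed ID,
# each entry a character vector of MeSH term strings, or NA when none exist)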
Remove subheadings/qualifiers
Split MeSH terms that contain commas into separate terms and keep only the last, most specific, one
qualifiers = c('abnormalities', 'administration & dosage', 'adverse effects', 'analogs & derivatives', 'analysis',
'anatomy & histology', 'antagonists & inhibitors', 'biosynthesis', 'blood supply', 'cerebrospinal fluid',
'chemical synthesis', 'chemically induced', 'classification', 'complications', 'congenital',
'cytology', 'deficiency', 'diagnosis', 'diagnostic imaging', 'diet therapy', 'drug effects', 'drug therapy',
'economics', 'education', 'embryology', 'enzymology', 'epidemiology', 'ethics', 'ethnology', 'etiology',
'growth & development', 'history', 'immunology', 'injuries', 'innervation', 'instrumentation',
'isolation & purification', 'legislation & jurisprudence', 'manpower', 'metabolism', 'methods', 'microbiology',
'mortality', 'nursing', 'organization & administration', 'parasitology', 'pathology', 'pharmacokinetics',
'pharmacology', 'physiology', 'physiopathology', 'poisoning', 'prevention & control', 'psychology',
'radiation effects', 'radiotherapy', 'rehabilitation', 'secondary', 'secretion', 'standards',
'statistics & numerical data', 'supply & distribution', 'surgery', 'therapeutic use', 'therapy', 'toxicity',
'transmission', 'trends', 'ultrastructure', 'urine', 'utilization', 'veterinary', 'virology')
qualifiers_all = c(qualifiers, 'agonists', 'blood', 'chemistry', 'genetics')
qualifiers_regex = paste(qualifiers, collapse='|')
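# Note: collapsing with '|' gives a plain substring regex. Four qualifiers
# ('agonists', 'blood', 'chemistry', 'genetics') are kept out of it because
# they also occur inside longer words ('antagonists', 'blood supply',
# 'immunohistochemistry'), so the loop below checks them separately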
for(article in names(mesh_terms_by_article_list)){
  mesh_terms = mesh_terms_by_article_list[[article]]
  new_mesh_terms = c()
  if(!is.na(mesh_terms[1])){
    for(mesh_term in mesh_terms){
      # Look for simple qualifiers and delete them
      match_general = grepl(qualifiers_regex, mesh_term)
      match_agonists = grepl('agonists', mesh_term) & !grepl('ntagonists', mesh_term)
      match_blood = grepl('blood', mesh_term) & !grepl('blood supply', mesh_term)
      match_chemistry = grepl('chemistry', mesh_term) & !grepl('immunohistochemistry', mesh_term)
      match_genetics = grepl('genetics', mesh_term) & !grepl('Xgenetics', mesh_term)
      if(match_agonists) mesh_term = gsub('agonists', '', mesh_term)
      if(match_blood) mesh_term = gsub('blood', '', mesh_term)
      if(match_chemistry) mesh_term = gsub('chemistry', '', mesh_term)
      if(match_genetics) mesh_term = gsub('genetics', '', mesh_term)
      if(match_general) mesh_term = gsub(qualifiers_regex, '', mesh_term)
      # Separate MeSH terms by commas
      mesh_term = mesh_term %>% strsplit(', ') %>% unlist
      mesh_term = mesh_term[length(mesh_term)] # keep last term
      # Update
      new_mesh_terms = c(new_mesh_terms, mesh_term)
    }
    # Update article entries
    mesh_terms_by_article_list[[article]] = unique(new_mesh_terms)
  }
}
rm(qualifiers, qualifiers_all, qualifiers_regex, article, mesh_term, mesh_terms, new_mesh_terms,
   match_general, match_agonists, match_blood, match_chemistry, match_genetics)
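# Quick sanity check of the cleaning on an illustrative term: no qualifier
# applies, so only the comma rule fires and the most specific part is kept
'Receptors, Serotonin' %>% strsplit(', ') %>% unlist %>% tail(1)  # "Serotonin"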
# Get list of all mesh terms
all_mesh_terms = c()
n = 0
for(mesh_terms_in_article in mesh_terms_by_article_list){
  all_mesh_terms = unique(c(all_mesh_terms, mesh_terms_in_article))
  n = c(n, length(all_mesh_terms))
}
# n starts at 0 articles, so the article axis runs from 0 to length(n)-1
mesh_terms_counts = data.frame('articles' = seq(0, length(n)-1), 'mesh_terms' = n)
print(paste0('Articles: ', length(mesh_terms_by_article_list), ' Mesh terms: ', length(all_mesh_terms)))
## [1] "Articles: 3709 Mesh terms: 4061"
ggplotly(mesh_terms_counts %>% ggplot(aes(articles, mesh_terms)) + geom_line(color='#434343') +
           geom_smooth(method='lm', color='#666666', se=FALSE, size=0.5) + theme_minimal() +
           theme(legend.position='none') + ggtitle('Increase in number of MeSH terms with each new article'))
rm(mesh_terms_in_article, n, mesh_terms_counts)
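# A loop-free sketch of the same cumulative counts (illustrative variable
# names; duplicated() counts NA once, matching the loop's use of unique())
terms_per_article = lengths(mesh_terms_by_article_list)
flat_terms = unlist(mesh_terms_by_article_list, use.names=FALSE)
n_alt = c(0, cumsum(!duplicated(flat_terms))[cumsum(terms_per_article)])
rm(terms_per_article, flat_terms, n_alt)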
all_articles = names(mesh_terms_by_article_list)
article_mesh_term_mat = data.frame(matrix(0, nrow=length(all_articles), ncol=sum(!is.na(all_mesh_terms))))
rownames(article_mesh_term_mat) = all_articles
colnames(article_mesh_term_mat) = all_mesh_terms[!is.na(all_mesh_terms)]
for(article in all_articles){
  article_mesh_terms = mesh_terms_by_article_list[[article]]
  if(sum(is.na(article_mesh_terms))==0){
    article_mesh_term_mat[article, article_mesh_terms] = 1
  }
}
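# Sketch (not used downstream): the same incidence matrix could be built
# directly as a sparse Matrix, avoiding the loop above; variable names here
# are illustrative
ij_article = rep(all_articles, lengths(mesh_terms_by_article_list))
ij_term = unlist(mesh_terms_by_article_list, use.names=FALSE)
ok = !is.na(ij_term)
sparse_incidence = Matrix::sparseMatrix(i = match(ij_article[ok], all_articles),
                                        j = match(ij_term[ok], colnames(article_mesh_term_mat)),
                                        x = 1, dims = dim(article_mesh_term_mat),
                                        dimnames = dimnames(article_mesh_term_mat))
rm(ij_article, ij_term, ok, sparse_incidence)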
rm(mesh_terms_by_article_list, article, article_mesh_terms)
A sizeable proportion of articles (almost 10%) have no MeSH terms
mesh_terms_by_article = data.frame('id' = rownames(article_mesh_term_mat), 'n_mesh_terms' = rowSums(article_mesh_term_mat))
bins = max(mesh_terms_by_article$n_mesh_terms)-min(mesh_terms_by_article$n_mesh_terms)+1
ggplotly(mesh_terms_by_article %>% ggplot(aes(n_mesh_terms)) + geom_histogram(bins=bins, alpha=0.6) +
ggtitle('Distribution of number of MeSH terms by article') + theme_minimal())
rm(bins)
Most MeSH terms are present in only a few articles
articles_by_mesh_term = data.frame('id' = as.character(colnames(article_mesh_term_mat)),
'n_articles' = colSums(article_mesh_term_mat), stringsAsFactors = FALSE)
bins = max(articles_by_mesh_term$n_articles)-min(articles_by_mesh_term$n_articles)+1
ggplotly(articles_by_mesh_term %>% ggplot(aes(n_articles)) + geom_histogram(bins=bins, alpha=0.6) +
scale_x_sqrt() + scale_y_sqrt() + theme_minimal() +
ggtitle('Distribution of number of articles that contain each MeSH term'))
rm(bins)
Many of the most frequent MeSH terms aren’t useful for characterising the articles
top_n_counts = 50
top_mesh_terms = articles_by_mesh_term %>% top_n(top_n_counts, wt=n_articles)
ggplotly(top_mesh_terms %>% ggplot(aes(reorder(id, n_articles), n_articles, fill=n_articles)) +
geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() +
ggtitle(paste0('Top ', top_n_counts, ' MeSH Terms')) +
xlab('') + ylab('No. Articles') + theme_minimal() + theme(legend.position='none'))
rm(mesh_terms_by_article)
keep_articles = rowSums(article_mesh_term_mat)>0
print(paste0(sum(!keep_articles), '/', nrow(article_mesh_term_mat), ' articles have no MeSH terms'))
## [1] "363/3709 articles have no MeSH terms"
keep_mesh_terms = colSums(article_mesh_term_mat)>1
print(paste0(sum(!keep_mesh_terms), '/', ncol(article_mesh_term_mat), ' MeSH terms are mentioned in a single article'))
## [1] "1662/4060 MeSH terms are mentioned in a single article"
article_mesh_term_mat = article_mesh_term_mat[keep_articles, keep_mesh_terms]
articles_by_mesh_term = articles_by_mesh_term %>% filter(id %in% colnames(article_mesh_term_mat))
print(paste0('Remaining Articles: ', nrow(article_mesh_term_mat), ' Mesh terms: ', ncol(article_mesh_term_mat)))
## [1] "Remaining Articles: 3346 Mesh terms: 2398"
rm(keep_articles, keep_mesh_terms)
keep_terms = c('abnormalit', 'acid', 'aging', 'allele', 'amin', 'analgesic', 'analysis', 'ancestry', 'astrocyte',
               'axon', 'behavior', 'binding', 'biomark', 'blot', 'brain', 'case-control', 'cell', 'cereb',
               'channel', 'chemistry', 'chromatin', 'chromosome', 'circadian', 'clon', 'cluster', 'codon',
               'cognit', 'cohort', 'consanguinity', 'cortex', 'cpg', 'culture', 'dendrit', 'develop',
               'disease', 'disorder', 'dna', 'dopamin', 'drug', 'electro', 'enzym', 'epilepsy', 'etiology',
               'exome', 'exon', 'fibroblast', 'fluorescence', 'gabapentin', 'gen', 'haplo', 'heart', 'hippocamp',
               'histon', 'hypothalamus', 'imaging', 'immun', 'in situ', 'in vitro', 'intron', 'kinase', 'knockout',
               'link', 'megalencephaly', 'membran', 'mental', 'messenger', 'metabo', 'methyl', 'microcephaly',
               'microscop', 'microtubules', 'missense', 'model', 'molecul', 'muta', 'neuro', 'nonsense', 'nucleotide',
               'oxytocin', 'pair', 'pathway', 'peptid', 'pervasive', 'phenotype', 'phosphor', 'polymerase',
               'polymorphism', 'promoter', 'protein', 'psychiatric', 'receptor', 'recessive', 'region', 'regulat',
               'risk', 'rna', 'seizure', 'sequence', 'serotonin', 'sibling', 'signal', 'spectrometry',
               'splicing', 'statistics', 'striatum', 'synap', 'technique', 'trait loci', 'transcript',
               'transfection', 'transloc', 'tumor', 'vasopressin', 'zygote')
articles_by_mesh_term = articles_by_mesh_term %>%
  mutate(keep = grepl(paste(keep_terms, collapse='|'), tolower(id))) %>%
  filter(keep)
article_mesh_term_mat = article_mesh_term_mat %>% dplyr::select(all_of(articles_by_mesh_term$id))
article_mesh_term_mat = article_mesh_term_mat[rowSums(article_mesh_term_mat)>0, ]
print(paste0('Articles: ', nrow(article_mesh_term_mat), ' Mesh terms: ', ncol(article_mesh_term_mat)))
## [1] "Articles: 3344 Mesh terms: 1147"
rm(keep_terms)
top_n_counts = 50
top_mesh_terms = articles_by_mesh_term %>% top_n(top_n_counts, wt=n_articles)
ggplotly(top_mesh_terms %>% ggplot(aes(reorder(id, n_articles), n_articles, fill=n_articles)) +
geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() +
ggtitle(paste0('New top ', top_n_counts, ' MeSH Terms')) +
xlab('') + ylab('No. Articles') + theme_minimal() + theme(legend.position='none'))
rm(top_n_counts, top_mesh_terms)
sparse_article_mesh_term_mat = Matrix(as.matrix(article_mesh_term_mat), sparse=TRUE)
# EDGES
mesh_mesh_mat = t(sparse_article_mesh_term_mat) %*% sparse_article_mesh_term_mat
edges = summary(mesh_mesh_mat) %>% rename('from'=i, 'to'=j, 'count'=x) %>%
mutate(from = colnames(article_mesh_term_mat)[from],
to = colnames(article_mesh_term_mat)[to]) %>%
filter(from!=to)
# Convert count to fraction
article_mesh_term_colsums = colSums(article_mesh_term_mat)
names(article_mesh_term_colsums) = colnames(article_mesh_term_mat)
edges = edges %>% mutate('weight'=count/(article_mesh_term_colsums[from]+article_mesh_term_colsums[to]))
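# e.g. with made-up counts: two terms co-occurring in 5 articles, appearing in
# 20 and 10 articles respectively, get weight 5/(20+10) ≈ 0.17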
# NODES
nodes = articles_by_mesh_term %>% mutate('id'=as.character(id), 'title'=as.character(id), 'value'=n_articles)
print(paste0('Nodes: ', nrow(nodes), ' Edges: ', nrow(edges)/2))
## [1] "Nodes: 1147 Edges: 50085"
graph = graph_from_data_frame(edges, vertices=nodes, directed=FALSE)
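# `edges` lists each undirected pair twice ((i,j) and (j,i)), so simplify()
# below collapses the duplicates, keeping the first copy of each attribute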
graph = graph %>% simplify(edge.attr.comb='first')
eigen_centrality_output = eigen_centrality(graph)
eigencentr = as.numeric(eigen_centrality_output$vector)
plot_data = data.frame('id'=graph %>% get.vertex.attribute('name'), 'eigencentrality'=eigencentr) %>%
top_n(50, wt=eigencentrality)
ggplotly(plot_data %>% ggplot(aes(reorder(id, eigencentrality), eigencentrality, fill=eigencentrality)) +
geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() +
ggtitle(paste0('MeSH terms with the highest Network centrality')) +
xlab('') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))
rm(plot_data)
comms_louvain = cluster_louvain(graph)
modules = comms_louvain %>% membership
color_pal = viridis_pal()(max(modules)) %>% substr(1,7)
graph = graph %>% set_vertex_attr('eigencentrality', value=eigencentr) %>%
set_vertex_attr('module', value=as.numeric(modules)) %>%
set_vertex_attr('color_modules', value=color_pal[as.numeric(modules)]) %>%
set_vertex_attr('color', value=color_pal[as.numeric(modules)])
comm_sizes = sizes(comms_louvain) %>% data.frame %>% rename(Module=Community.sizes, size=Freq)
ggplotly(comm_sizes %>% ggplot(aes(Module, size, fill=size)) + geom_bar(stat='identity') +
ggtitle('Number of MeSH Terms in each module') + theme_minimal() + theme(legend.position = 'none'))
visIgraph(graph) %>% visOptions(selectedBy='module', highlightNearest=TRUE) %>%
visIgraphLayout(randomSeed=123)
rm(color_pal, eigen_centrality_output, eigencentr, comm_sizes, edges, nodes)
Most important MeSH terms for modules with more than 50 nodes
module_info = tibble('module'=character(), 'top_term'=character(), size=integer())
for(i in names(sizes(comms_louvain))){
  module_vertices = names(modules)[modules==i]
  # Create subgraph
  module_graph = induced_subgraph(graph, module_vertices)
  # Calculate eigencentrality
  eigen_centrality_output = eigen_centrality(module_graph)
  eigencentr = as.numeric(eigen_centrality_output$vector)
  names(eigencentr) = module_vertices
  module_info = module_info %>% add_row('module'=i, 'top_term'=names(eigencentr)[eigencentr==max(eigencentr)][1],
                                        'size'=length(module_vertices))
  if(length(module_vertices)>50){
    # Plot eigencentrality of top 10 MeSH terms
    plot_data = data.frame('mesh_term'=names(eigencentr), 'eigencentrality'=eigencentr) %>%
                top_n(10, wt=eigencentrality)
    print(plot_data %>% ggplot(aes(reorder(mesh_term, eigencentrality), eigencentrality, fill=eigencentrality)) +
            geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() +
            ggtitle(paste0('Top terms for Module ', i)) +
            xlab('MeSH Terms') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))
  }
}
rm(i, module_vertices, module_graph, eigen_centrality_output, eigencentr, plot_data, comms_louvain)
Collapsing edges: sum all the original weights between the two modules and divide by the product of the modules’ sizes (the number of possible edges between them)
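For example (with made-up numbers): two modules of 10 and 20 nodes sharing a summed weight of 5 get a corrected weight of 5/(10*20)*100 = 2.5.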
The network plot is not very informative
module_graph = contract(graph, graph %>% get.vertex.attribute('module'), vertex.attr.comb='first') %>%
simplify(edge.attr.comb='sum') %>% set_vertex_attr('name', value=module_info$module) %>%
set_vertex_attr('title', value=module_info$top_term) %>%
set_vertex_attr('module_size', value=module_info$size) %>%
set_vertex_attr('value', value=module_info$size)
nodes_module_graph = module_graph %>% as_data_frame(what='vertices') %>% mutate('id'=name)
edges_module_graph = module_graph %>% as_data_frame() %>%
mutate('corrected_weight' = weight/(nodes_module_graph$module_size[as.numeric(from)]*
nodes_module_graph$module_size[as.numeric(to)])*100)
module_graph = module_graph %>% set_edge_attr('weight', value=edges_module_graph$corrected_weight)
visIgraph(module_graph) %>% visIgraphLayout(randomSeed=123) %>% visOptions(highlightNearest=TRUE)
rm(module_info, nodes_module_graph, edges_module_graph)
adj_mat = module_graph %>% as_adjacency_matrix(attr='weight') %>% as.matrix
heatmap.2(adj_mat, cellnote=round(adj_mat,2), dendrogram='none', notecol='gray', scale='none', trace='none',
key=FALSE, xlab='Modules', ylab='Modules', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))
rm(adj_mat)
Save a file with the modules each paper belongs to, assigned through its MeSH terms
mesh_term_module = graph %>% as_data_frame(what='vertices') %>% dplyr::select(name, module)
mesh_term_module_mat = matrix(0, nrow(mesh_term_module), max(mesh_term_module$module))
rownames(mesh_term_module_mat) = mesh_term_module$name
colnames(mesh_term_module_mat) = sort(unique(mesh_term_module$module))
for(i in seq(1, nrow(mesh_term_module))){
  mesh_term_module_mat[i, mesh_term_module[i,2]] = 1
}
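# Equivalent vectorized assignment via matrix indexing (same result, no loop):
# mesh_term_module_mat[cbind(seq_len(nrow(mesh_term_module)), mesh_term_module$module)] = 1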
if(!all(colnames(article_mesh_term_mat) == rownames(mesh_term_module_mat))){
  print("Row and column order don't match!")
}
article_module_mat = as.matrix(article_mesh_term_mat) %*% mesh_term_module_mat
rownames(article_module_mat) = rownames(article_mesh_term_mat)
write.csv(article_module_mat, file='./../Data/article_module_matrix_wo_qualifiers_specific.csv')
print('Distribution of number of modules per article')
## [1] "Distribution of number of modules per article"
table(apply(article_module_mat, 1, function(x) sum(x>0)))
##
## 1 2 3 4 5 6
## 731 1245 842 374 122 30
melt_article_module = melt(article_module_mat)
colnames(melt_article_module) = c('articleID', 'module', 'value')
melt_article_module$module = as.factor(melt_article_module$module)
ggplotly(melt_article_module %>% ggplot(aes(value, fill=module, color=module)) +
geom_histogram(position='identity', bins=max(melt_article_module$value), alpha=0.5) +
facet_grid(module~., scales='free_y') + ggtitle('Module content distribution per article') +
xlab('n_articles') + ylab('Module content') + theme_minimal() + theme(legend.position='none'))
rm(mesh_term_module, mesh_term_module_mat, i)